{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 45.Spark编程"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 45.1 导入pyspark包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 45.2 SparkSession及其创建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'The entry point to programming Spark with the Dataset and DataFrame API.\\n\\n    A SparkSession can be used create :class:`DataFrame`, register :class:`DataFrame` as\\n    tables, execute SQL over tables, cache tables, and read parquet files.\\n    To create a SparkSession, use the following builder pattern:\\n\\n    >>> spark = SparkSession.builder \\\\\\n    ...     .master(\"local\") \\\\\\n    ...     .appName(\"Word Count\") \\\\\\n    ...     .config(\"spark.some.config.option\", \"some-value\") \\\\\\n    ...     .getOrCreate()\\n\\n    .. autoattribute:: builder\\n       :annotation:\\n    '"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SparkSession.__doc__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Builder',\n",
       " '__class__',\n",
       " '__delattr__',\n",
       " '__dict__',\n",
       " '__dir__',\n",
       " '__doc__',\n",
       " '__enter__',\n",
       " '__eq__',\n",
       " '__exit__',\n",
       " '__format__',\n",
       " '__ge__',\n",
       " '__getattribute__',\n",
       " '__gt__',\n",
       " '__hash__',\n",
       " '__init__',\n",
       " '__le__',\n",
       " '__lt__',\n",
       " '__module__',\n",
       " '__ne__',\n",
       " '__new__',\n",
       " '__reduce__',\n",
       " '__reduce_ex__',\n",
       " '__repr__',\n",
       " '__setattr__',\n",
       " '__sizeof__',\n",
       " '__str__',\n",
       " '__subclasshook__',\n",
       " '__weakref__',\n",
       " '_convert_from_pandas',\n",
       " '_createFromLocal',\n",
       " '_createFromRDD',\n",
       " '_create_from_pandas_with_arrow',\n",
       " '_create_shell_session',\n",
       " '_get_numpy_record_dtype',\n",
       " '_inferSchema',\n",
       " '_inferSchemaFromList',\n",
       " '_instantiatedSession',\n",
       " '_repr_html_',\n",
       " 'builder',\n",
       " 'catalog',\n",
       " 'conf',\n",
       " 'createDataFrame',\n",
       " 'newSession',\n",
       " 'range',\n",
       " 'read',\n",
       " 'readStream',\n",
       " 'sparkContext',\n",
       " 'sql',\n",
       " 'stop',\n",
       " 'streams',\n",
       " 'table',\n",
       " 'udf',\n",
       " 'version']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dir(SparkSession)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mySpark = SparkSession.builder\\\n",
    "    .appName('My_App')\\\n",
    "    .master('local')\\\n",
    "    .getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "            <div>\n",
       "                <p><b>SparkSession - in-memory</b></p>\n",
       "                \n",
       "        <div>\n",
       "            <p><b>SparkContext</b></p>\n",
       "\n",
       "            <p><a href=\"http://2013-20150813LW:4040\">Spark UI</a></p>\n",
       "\n",
       "            <dl>\n",
       "              <dt>Version</dt>\n",
       "                <dd><code>v2.4.0</code></dd>\n",
       "              <dt>Master</dt>\n",
       "                <dd><code>local</code></dd>\n",
       "              <dt>AppName</dt>\n",
       "                <dd><code>My_App</code></dd>\n",
       "            </dl>\n",
       "        </div>\n",
       "        \n",
       "            </div>\n",
       "        "
      ],
      "text/plain": [
       "<pyspark.sql.session.SparkSession at 0x6e0e0b8>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mySpark"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 45.3 Spark数据抽象类型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "myDF = mySpark.range(1,100).toDF(\"number\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DataFrame[number: bigint]\n"
     ]
    }
   ],
   "source": [
    "print(myDF)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- number: long (nullable = false)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "myDF.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    " #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[number: bigint]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "divisBy2 = myDF.where(\"number % 2 = 0\")\n",
    "divisBy2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "49"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "divisBy2.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(number=1), Row(number=2), Row(number=3), Row(number=4), Row(number=5)]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "myDF.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[number: bigint]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "myDF = mySpark.range(1,100).toDF(\"number\").where(\"number % 2 = 0\").sort(\"number\")\n",
    "myDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "== Physical Plan ==\n",
      "*(2) Sort [number#16L ASC NULLS FIRST], true, 0\n",
      "+- Exchange rangepartitioning(number#16L ASC NULLS FIRST, 200)\n",
      "   +- *(1) Project [id#14L AS number#16L]\n",
      "      +- *(1) Filter (((id#14L % 2) = 0) && ((id#14L % 5) = 0))\n",
      "         +- *(1) Range (100, 1, step=1, splits=1)\n"
     ]
    }
   ],
   "source": [
    "myDF=mySpark.range(100,1).toDF(\"number\").where(\"number % 2 = 0\").filter(\"number % 5 = 0\").sort(\"number\").explain()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 45.4. Spark DataFrame操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = mySpark.read.csv('flights.csv', header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- year: string (nullable = true)\n",
      " |-- month: string (nullable = true)\n",
      " |-- day: string (nullable = true)\n",
      " |-- dep_time: string (nullable = true)\n",
      " |-- dep_delay: string (nullable = true)\n",
      " |-- arr_time: string (nullable = true)\n",
      " |-- arr_delay: string (nullable = true)\n",
      " |-- carrier: string (nullable = true)\n",
      " |-- tailnum: string (nullable = true)\n",
      " |-- flight: string (nullable = true)\n",
      " |-- origin: string (nullable = true)\n",
      " |-- dest: string (nullable = true)\n",
      " |-- air_time: string (nullable = true)\n",
      " |-- distance: string (nullable = true)\n",
      " |-- hour: string (nullable = true)\n",
      " |-- minute: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[year: string, month: string, day: string, dep_time: string, dep_delay: string, arr_time: string, arr_delay: string, carrier: string, tailnum: string, flight: string, origin: string, dest: string, air_time: string, distance: string, hour: string, minute: string]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+\n",
      "|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|\n",
      "+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+\n",
      "|2014|    1|  1|       1|       96|     235|       70|     AS| N508AS|   145|   PDX| ANC|     194|    1542|   0|     1|\n",
      "|2014|    1|  1|       4|       -6|     738|      -23|     US| N195UW|  1830|   SEA| CLT|     252|    2279|   0|     4|\n",
      "|2014|    1|  1|       8|       13|     548|       -4|     UA| N37422|  1609|   PDX| IAH|     201|    1825|   0|     8|\n",
      "|2014|    1|  1|      28|       -2|     800|      -23|     US| N547UW|   466|   PDX| CLT|     251|    2282|   0|    28|\n",
      "|2014|    1|  1|      34|       44|     325|       43|     AS| N762AS|   121|   SEA| ANC|     201|    1448|   0|    34|\n",
      "+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['year',\n",
       " 'month',\n",
       " 'day',\n",
       " 'dep_time',\n",
       " 'dep_delay',\n",
       " 'arr_time',\n",
       " 'arr_delay',\n",
       " 'carrier',\n",
       " 'tailnum',\n",
       " 'flight',\n",
       " 'origin',\n",
       " 'dest',\n",
       " 'air_time',\n",
       " 'distance',\n",
       " 'hour',\n",
       " 'minute']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "52535"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "spark_df_flights_selected = df.select(df['tailnum'], df['flight'],\n",
    "                                      df['dest'], df['arr_delay'],\n",
    "                                      df['dep_delay'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+----+---------+---------+\n",
      "|tailnum|flight|dest|arr_delay|dep_delay|\n",
      "+-------+------+----+---------+---------+\n",
      "| N508AS|   145| ANC|       70|       96|\n",
      "| N195UW|  1830| CLT|      -23|       -6|\n",
      "| N37422|  1609| IAH|       -4|       13|\n",
      "+-------+------+----+---------+---------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "spark_df_flights_selected.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df.createTempView('flights_view')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 45.5 SQL编程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sql_str = 'select dest, arr_delay from flights_view'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "spark_destDF = mySpark.sql(sql_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+---------+\n",
      "|dest|arr_delay|\n",
      "+----+---------+\n",
      "| ANC|       70|\n",
      "| CLT|      -23|\n",
      "| IAH|       -4|\n",
      "+----+---------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "spark_destDF.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import tempfile\n",
    "tempfile.mkdtemp()\n",
    "spark_destDF.write.csv(\"spark.csv\",mode='overwrite')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+---+\n",
      "|_c0|_c1|\n",
      "+---+---+\n",
      "|ANC| 70|\n",
      "|CLT|-23|\n",
      "|IAH| -4|\n",
      "+---+---+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "dfnew = mySpark.read.csv('spark.csv')\n",
    "\n",
    "dfnew.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+\n",
      "|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|\n",
      "+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+\n",
      "|2014|    1|  1|     654|       -6|    1455|      -10|     DL| N686DA|   418|   SEA| JFK|     273|    2422|   6|    54|\n",
      "|2014|    1|  1|     708|       -7|    1510|      -19|     AA| N3DNAA|   236|   SEA| JFK|     281|    2422|   7|     8|\n",
      "|2014|    1|  1|     708|       -2|    1453|      -20|     DL| N3772H|  2258|   PDX| JFK|     267|    2454|   7|     8|\n",
      "+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+\n",
      "only showing top 3 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "jfkDF = df.filter(df['dest'] == 'JFK')\n",
    "jfkDF.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+------------------+\n",
      "|day|      avg(arr_delay)|    avg(dep_delay)|\n",
      "+---+--------------------+------------------+\n",
      "|  7|0.025215252152521524| 5.243243243243243|\n",
      "| 15|  1.0819155639571518| 4.818353236957888|\n",
      "| 11|   5.749170537491706| 7.250661375661376|\n",
      "| 29|   6.407451923076923| 11.32174955062912|\n",
      "|  3|   5.629350893697084|11.526241799437676|\n",
      "| 30|   9.433526011560694| 12.31663788140472|\n",
      "|  8|    0.52455919395466| 4.555904522613066|\n",
      "| 22| -1.0817571690054912|  6.10231425091352|\n",
      "| 28| -3.4050632911392404| 4.110270951480781|\n",
      "| 16| 0.31582125603864736|4.2917420132610005|\n",
      "|  5|    4.42015503875969| 8.219989696032973|\n",
      "| 31|   5.796638655462185| 6.382229673093042|\n",
      "| 18|  -0.235370611183355|3.0194931773879143|\n",
      "| 27|  -4.354777070063694| 4.864126984126984|\n",
      "| 17|  1.8664688427299703| 5.873815165876778|\n",
      "| 26| -1.5248683440608544| 4.833430742255991|\n",
      "|  6|  3.1785932721712538| 7.075045759609518|\n",
      "| 19|  2.8462462462462463| 7.208383233532934|\n",
      "| 23|   2.352836879432624| 6.307105108631826|\n",
      "| 25| -2.3858004018754184|3.4145527369826434|\n",
      "+---+--------------------+------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "dailyDelayDF = df.groupBy(df.day)\\\n",
    "                 .agg({'dep_delay': 'mean', 'arr_delay':'mean'})\n",
    "dailyDelayDF.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- day: string (nullable = true)\n",
      " |-- avg(arr_delay): double (nullable = true)\n",
      " |-- avg(dep_delay): double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "dailyDelayDF.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 45.6. DataFrame的可视化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- day: string (nullable = true)\n",
      " |-- avg_arr_delay: double (nullable = true)\n",
      " |-- avg_dep_delay: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "dailyDelayDF = dailyDelayDF.withColumnRenamed('avg(arr_delay)', 'avg_arr_delay')\n",
    "dailyDelayDF = dailyDelayDF.withColumnRenamed('avg(dep_delay)', 'avg_dep_delay')\n",
    "dailyDelayDF.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>day</th>\n",
       "      <th>avg_arr_delay</th>\n",
       "      <th>avg_dep_delay</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7</td>\n",
       "      <td>0.025215</td>\n",
       "      <td>5.243243</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>15</td>\n",
       "      <td>1.081916</td>\n",
       "      <td>4.818353</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>11</td>\n",
       "      <td>5.749171</td>\n",
       "      <td>7.250661</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>29</td>\n",
       "      <td>6.407452</td>\n",
       "      <td>11.321750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>5.629351</td>\n",
       "      <td>11.526242</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>30</td>\n",
       "      <td>9.433526</td>\n",
       "      <td>12.316638</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>8</td>\n",
       "      <td>0.524559</td>\n",
       "      <td>4.555905</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>22</td>\n",
       "      <td>-1.081757</td>\n",
       "      <td>6.102314</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>28</td>\n",
       "      <td>-3.405063</td>\n",
       "      <td>4.110271</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>16</td>\n",
       "      <td>0.315821</td>\n",
       "      <td>4.291742</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  day  avg_arr_delay  avg_dep_delay\n",
       "0   7       0.025215       5.243243\n",
       "1  15       1.081916       4.818353\n",
       "2  11       5.749171       7.250661\n",
       "3  29       6.407452      11.321750\n",
       "4   3       5.629351      11.526242\n",
       "5  30       9.433526      12.316638\n",
       "6   8       0.524559       4.555905\n",
       "7  22      -1.081757       6.102314\n",
       "8  28      -3.405063       4.110271\n",
       "9  16       0.315821       4.291742"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "local_dailyDelay = dailyDelayDF.toPandas()\n",
    "\n",
    "local_dailyDelay.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.text.Text at 0xa0e0eb8>"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAhoAAAF5CAYAAADZMYNPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJzt3XtwpXd93/H3F2FuTljEZYAGXPDqEpdMDRKBJeA1F00k\na4sDM52Es7uiKUNJSpl1t5NyCSTc2oRAwZgSKJmUq8LJQKED1GsJBIy3ARYnUiBcDEdSgDUQw4CU\npTEmEO2vf5yztqTV5RzpPOc5z3PerxnNSM959Oj78yPv+eh3eyKlhCRJUhbukXcBkiSpvAwakiQp\nMwYNSZKUGYOGJEnKjEFDkiRlxqAhSZIyY9CQJEmZMWhIkqTMGDQkSVJmDBqSJCkzXRE0IuKqiPho\nRHwnIs5HxLU7nPs/Guec6GSNkiSpdV0RNIBLgS8ALwS2ffhKRDwbeCLwnQ7VJUmS9uGeeRcAkFKa\nAWYAIiK2OicifgG4ARgHTnWuOkmStFfd0qOxo0b4eC/w+pTSrXnXI0mSmlOIoAG8FPhpSumteRci\nSZKa1xVDJzuJiFHgBPC4Fr7nQdSHWL4J/CSbyiRJKqX7AI8CZlNKP9zvxbo+aABPAR4C3LZu+kYf\n8KaI+I8ppcu3+J5x4M86VJ8kSWV0DHj/fi9ShKDxXuATm459vHH8Xdt8zzcBpqenueKKK7KrrAuc\nPHmS66+/Pu8yOqJX2mo7y8V2lksvtPPWW2/l+PHj0Hgv3a+uCBoRcSkwAFzosrg8Iq4EVlJKtwGr\nm87/GXB7Smlxm0v+BOCKK65gZGQko6q7w4EDB0rfxgt6pa22s1xsZ7n0Sjsb2jL1oCuCBvB44NPU\n99BIwBsbx98DPG+L87fda0OSJHWPrggaKaWbaWEFzDbzMiRJUpcpyvJWSZJUQAaNgqtUKnmX0DG9\n0lbbWS62s1x6pZ3tFCmVb7pDRIwA8/Pz8700aUeSpH1bWFhgdHQUYDSltLDf69mjIUmSMmPQkCRJ\nmTFoSJKkzBg0JElSZgwakiQpMwYNSZKUGYOGJEnKjEFDkiRlxqAhSZIyY9CQJEmZMWhIkqTMGDQk\nSVJmDBqSJCkzBg1JkpQZg4YkScqMQUOSJGXGoCFJkjJj0JAkSZkxaEiSpMwYNCRJUmYMGpIkKTMG\nDUmSlBmDhiRJyoxBQ5IkZcagIUmSMmPQkCRJmTFoSJKkzBg0JElSZgwakiQpMwYNSZKUGYOGJEnK\nzD3zLkCSpCKo1WosLy8zMDDA4OBg3uUUhj0akiTtYGVlhYmJIwwPDzM5OcnQ0BATE0dYXV3Nu7RC\nMGhIkrSDo0enmJs7A0wDZ4Fp5ubOUKkcz7myYnDoRJKkbdRqNWZnT1EPGccaR4+xtpaYnZ1icXHR\nYZRd2KMhSdI2lpeXG58d3vTK1QAsLS11tJ4iMmhIkrSNgwcPNj47vemVmwEYGBjoaD1FZNCQJGkb\nQ0NDjI9P0td3gvrwyW3ANH191zE+PumwSRO6ImhExFUR8dGI+E5EnI+Ia9e9ds+I+KOI+JuI+IfG\nOe+JiIfnWbMkqTdUq9OMjR0CpoDLgCnGxg5RrU7nXFkxdMtk0EuBLwD/E/jwptfuBzwWeDXwN0A/\n8BbgI8ATOlijJKkH9ff3MzNzI4uLiywtLbmPRou6ImiklGaAGYCIiE2v/QgYX38sIl4EfD4iHpFS\n+nbHCpUk9azBwUEDxh50xdDJHjwASMDf512IJEnaXuGCRkTcG3gd8P6U0j/kXY8kSdpeVwydNCsi\n7gl8kHpvxgt3O//kyZMcOHBgw7FKpUKlUsmmQEmSCqRarVKtVjccO3fuXFt/RqSU2nrB/YqI88Cz\nUkof3XT8Qsh4FPD0lNK2m8xHxAgwPz8/z8jISJblSpJUKgsLC4yOjgKMppQW9nu9QvRorAsZlwNP\n2ylkSJKk7tEVQSMiLgUGgAsrTi6PiC
uBFeDvgA9RX+L6r4BLIuKhjfNWUko/63S9kiSpOV0RNIDH\nA5+mPvciAW9sHH8P9f0zntk4/oXG8Wh8/TQu3hdWkiR1ia4IGimlm9l5BUzhVsdIkiTfwCVJUoYM\nGpIkKTNdMXSi4qvVaiwvL/sMAEnSBvZoaF9WVlaYmDjC8PAwk5OTDA0NMTFxhNVVVyBLkgwa2qej\nR6eYmzsDTANngWnm5s5QqRzPuTJJUjdw6ER7VqvVmJ09RT1kHGscPcbaWmJ2dorFxUWHUSSpx9mj\noT1bXl5ufHZ40ytXA7C0tNTReiRJ3cegoT07ePBg47PNe6bdDMDAwEBH65EkdR+DhvZsaGiI8fFJ\n+vpOUB8+uQ2Ypq/vOsbHJx02kdT1arUaN910E4uLi3mXUloGDe1LtTrN2NghYAq4DJhibOwQ1ep0\nzpVJ0vZcMdc5TgbVvvT39zMzcyOLi4ssLS25j4akQti4Yu4wcJq5uRNUKseZmbkx5+rKxaChthgc\nHDRgSCoEV8x1lkMnkqSe4oq5zjJoSJJ6iivmOsugIUnqKa6Y6yyDhiSp57hirnOcDCpJ6jmumOsc\ng4YkqWe5Yi57Dp1IkqTMGDQkSVJmDBqSJCkzBg1JkpQZg4YkScqMQUOSJGXGoCFJkjJj0JAkSZkx\naEiSpMwYNCRJUmYMGpIkKTMGDUmSlBmDhiRJyoxBQ5IkZcagIUmSMmPQkCRJmTFoSJKkzBg0JElS\nZgwakiQpMwYNSZKUGYOGJEnKjEFDkiRlpiuCRkRcFREfjYjvRMT5iLh2i3NeExHfjYgfR8QnImIg\nj1olSVLzuiJoAJcCXwBeCKTNL0bES4AXAS8AngDcAcxGxL06WaQkSWrNPfMuACClNAPMAEREbHHK\ndcBrU0r/p3HOc4HvAc8CPtCpOiVJUmu6pUdjWxHxaOBhwCcvHEsp/Qj4PPCkvOqSJEm76/qgQT1k\nJOo9GOt9r/GaJEnqUkUIGpIkqaC6Yo7GLm4HAngoG3s1Hgr89U7fePLkSQ4cOLDhWKVSoVKptLtG\nSZIKp1qtUq1WNxw7d+5cW39GpHTRIo9cRcR54FkppY+uO/Zd4A0ppesbX9+feuh4bkrpg1tcYwSY\nn5+fZ2RkpEOVS5JUfAsLC4yOjgKMppQW9nu9rujRiIhLgQHqPRcAl0fElcBKSuk24M3AKyJiCfgm\n8Frg28BHcihXkiQ1qSuCBvB44NPUJ30m4I2N4+8BnpdSen1E3A94B/AA4P8C16SUfppHsZIkqTld\nETRSSjezy8TUlNKrgFd1oh5JktQerjqRJEmZ6YoeDfWeWq3G8vIyAwMDDA4O5l2OJCkj9mioo1ZW\nVpiYOMLw8DCTk5MMDQ0xMXGE1dXVvEuTJGXAoKGOOnp0irm5M8A0cBaYZm7uDJXK8ZwrkyRlwaET\ndUytVmN29hT1kHGscfQYa2uJ2dkpFhcXHUaRpJKxR0Mds7y83Pjs8KZXrgZgaWmpo/VIkrJn0FDH\nHDx4sPHZ6U2v3AzAwMBAR+uRJGXPoKGOGRoaYnx8kr6+E9SHT24Dpunru47x8UmHTSSphAwa6qhq\ndZqxsUPAFHAZMMXY2CGq1emcK5MkZcHJoOqo/v5+ZmZuZHFxkaWlJffRkKSSM2goF4ODgwYMSeoB\nDp1IkqTMGDQkSVJmDBqSJCkzBg1JkpQZg4YkScqMQUOSJGXGoCFJkjJj0JAkSZkxaEiSpMwYNCRJ\nUmYMGpIkKTMGDUmSlBmDhiRJykxLT2+NiP8M9LfwLd9OKb2ttZIkSVJZtPqY+CngRUA0ef4bAIOG\nJEk9qtWgsZZSOt3syRHRbCCRJEkl1GrQSBmfry5Sq9VYXl5mYGCAwcHBvMuRJBWQk0F1kZWVFSYm\njjA8PMzk5CRDQ0NMTBxhdXU179IkSQVj0NBFjh6dYm7uDDANnAWmmZs7Q6VyPOfKJElF0+rQyb0j\n4r
lNnhs0P2lUXaJWqzE7e4p6yDjWOHqMtbXE7OwUi4uLDqNIkprWao/GfwXu2+THfYA/aFul6ojl\n5eXGZ4c3vXI1AEtLSx2tp9fUajVuuukmFhcX8y5Fktqi1R6NzwGXtHD+nS1eXzk7ePBg47PT3N2j\nAXAzAAMDA50uqSesrKxw9OhUozepbnx8kmp1mv7+VrauUTdxQrXUetC4Cfgsuw+JpMY5jwGesIe6\nlJOhoSHGxyeZmzvB2lqi3pNxM3191zE2Nuk/lhnZOC/mMHCaubkTVCrHmZm5Mefq1CqDo3S3VoPG\nnSml5zV7ckT8ZYvXVxeoVqepVI4zOzt117Gxsfo/kmo/58WUj8FRupv7aOgi/f39zMzcyOLiIktL\nS3b7ZqyZeTH+9y8Og6O0UatBQz1kcHDQfxA7wHkx5WJwlDZyHw0VQplXY1yYF9PXd4L6X8G3AdP0\n9V3H+LjzYopmY3Bcz+Co3pR10HAfDe1Lr+xSWq1OMzZ2iPpzCy8DphgbO+S8mAIyOEobtTp08q2I\n+FwL53+pxetLG/TKpDrnxZSLE6qlu0VK5ZuvGREjwPz8/DwjIyN5l6M9qtVqDA8Ps3FSHY2vp6jV\nar4Zt8A9HTrP4KgiWlhYYHR0FGA0pbSw3+u11KMRER8CHt7Ct3w1pfT81kra8ufeA3g19XebhwHf\nBd6dUvov+722upeT6trDPR3y44RqqfWhk8tTSo9r9uSIuKXF62/npcBvAc8Fvgo8Hnh3RPx9Sumt\nbfoZ6jKuxmiPXhl+ktSdst5Ho12eBHwkpTTT+PpsRBzFXUdLzV1K9889HSTlrSjLWz8LPCMiBgEi\n4krgycCpHb9LhedqjP3xIXmS8laUDbteB9wf+FpErFEPSC9PKf15vmUpa67G2B+HnyTlrShB4zeA\no8BzqM/ReCxwQ0R8N6X0vlwrU0c4qW5vHH6SlLdWg8alEfHOJs8N2rdh1+uBP0wpfbDx9Vci4lHA\ny4Btg8bJkyc5cODAhmOVSoVKpdKmsqTu554OkrZTrVapVqsbjp07d66tP6OlfTQi4nLgkhauf2dK\n6WzLVV38c38A/G5K6U/WHXsZ8G9SSr+4xfnuoyFt4vCTpGbkuo8G8ETg51s4//vAvoMG8DHgFRHx\nbeArwAhwEvjTNlxb6gkOP0nKQ6urTl4O/AT4xyY/frdNdb4I+F/AH1Ofo/F64O3A77fp+pIkKQOt\n9mj8LKX03mZPjogXtXj9LaWU7gD+U+NDkiQVRKs9Gq1u2FW+B6lIkqSmFWXDLkmSVEAGDUmSlJlW\n52hcEhGb9zLeTjv30ZAkSQXUatB4H3BNC+e/u8XrS5KkEmk1aFxPa70U51u8viRJKpFWg8ZXgG83\neW4A96O+yZckSepBrQaNO1JKT2/25Ij4yxavL0mSSsR9NCRJUmZc3ipJkjLT6tCJpC5Rq9VYXl72\naaySupo9GlLBrKysMDFxhOHhYSYnJxkaGmJi4girq6t5l1Z6tVqNm266icXFxbxLkQqj1R6Nn0bE\nZ1s4/wctXl/SLo4enWJu7gwwDRwGTjM3d4JK5TgzMzfmXF05rayscPToFLOzp+46Nj4+SbU6TX9/\nf46VSd2v1aBxC/CQFs5favH6knZQq9Uab3bTwLHG0WOsrSVmZ6dYXFx0GCUDhjtp71oNGoeBa2l+\n064PAr/X4s+Q9qXMcxeWl5cbn21+EsDVACwtLZWuzXkz3En702rQSCmls82eHBE+60Qd0wvd2wcP\nHmx8dpq73/QAbgZgYGCg0yWVnuFO2h/30VBpbOzePgtMMzd3hkrleM6Vtc/Q0BDj45P09Z2g3s7b\ngGn6+q5jfHzSN7wMbAx36xnupGa46kSlcKF7e23tLdT/0n8k9e7tG5idPVWqVQLV6jRjY4eAKeAy\nYIqxsUNUq9M5V1ZOhjtpf9xHQ6XQS93b/f39zMzcyOLiIktLS6Wc
i9JtqtVpKpXjzM5O3XVsbGzS\ncCc1odWgcd+I+P0mz3V+hjqmF+cuDA4OGjA6xHAn7V2rQeO3gPu2cP5si9eX9uRC9/bc3AnW1hL1\nnoyb6eu7jrExu7fVHoY7qXUtBY2U0ubZUCqQMi/7BLu3JakbOUejB/TCsk+we1uSupGrTnpALyz7\nXG9wcJBrrrnGkCFJXcAejZJzV0NJUp7s0Si5ZpZ9SuoOPh1WZWTQKDl3NZS638rKChMTRxgeHmZy\ncpKhoSEmJo6wurqad2nSvhk0Ss5dDaXu12vzqNRbDBo9wC2rpe7VS9vnqzc5GbQHuOxT6l69tH2+\nepNBo4e4q6HUfXpx+3z1FodOJClHzqNS2Rk0JClnzqMqJ5cr1zl0Ikk5cx5VufTKYx+aZY+GJHWJ\nLLfP96/rznG58kYGDUkqMTcD6yyXK1/MoCFJJeZf153lYx8uZtCQpJLyr+vO87EPFzNoSFJJ+dd1\n57lc+WIGDUkqKf+6zofLlTdyeaskFVCtVmN5eXnHpbAX/rqemzvB2lqi3pNxM3191zE21pt/XXeC\ny5U3KkyPRkT8s4h4X0T8ICJ+HBFfjIiRvOuSpE5qdRWJf13nJ8vlykVSiB6NiHgA8Bngk8A48ANg\nEHB9lqSesnEVyWHgNHNzJ6hUjjMzc+NF5/vXtfJWiKABvBQ4m1J6/rpj38qrGEnKw4VVJPWQceEB\nbMdYW0vMzk6xuLi4bYjwoYrKS1GGTp4J/FVEfCAivhcRCxHx/F2/S8qZuzGqnVxFoiIqStC4HPj3\nwNeBXwXeDrwlIqZyrUrahrsx5qfM4c5VJCqiSCnlXcOuIuIfgVtSSletO3YD8PiU0pO3OH8EmD98\n+DAHDhzY8FqlUqFSqWRdsnrcxMQR5ubONDZKqo+j9/WdYGzs0Jbj6Nq/XnmQ1d2/WzewcRWJv1tq\nXbVapVqtbjh27tw5Tp8+DTCaUlrY9w9JKXX9B/BN4E82Hftt4LZtzh8B0vz8fJI67etf/3oCEkwn\nSOs+3peAVKvV8i6xlMbHJ1Nf3wMb/93PJphOfX0PTOPjk3mX1lYrKytpfHyy8TtW/xgfn0wrKyt5\nl6aSmJ+fv/C7NZLa8B5elMmgnwGGNx0bxgmh6kLNjKM7Ka+99jNJsmhcRaKiKUrQuB74TES8DPgA\n8ETg+cC/y7UqaQsbx9GPrXvFcfSs9GK4cxWJiqIQk0FTSn8FPBuoAF8CXg5cl1L681wLk7ZQlmcd\nFGlSpZMkpe5ViKABkFI6lVL6lyml+6WUHpNSemfeNUnbKfJujEVcMVOWcCeVUWGChlQkF8bRa7Ua\np06dolarMTNzYyFWP2zcefIsMM3c3BkqleM5V7azIoc7qcyKMkdDKqSijaMXeVKlkySl7mTQkHSX\nMkyqLFq4k8rOoRNJd3FSpaR2M2hIuouTKiW1m0FD0gZ7mVRZpKWwkjrLORqSNmhlUmWvPF9E0t7Z\noyFpS4ODg1xzzTU7DpcUdSmspM6xR0PSnhR5KaykzrFHQ9KeNLMUVpIMGup5TmTcG5fCSmqGQUM9\nq4jP9OgmLoWV1AyDhnqWExn3z+eLSNqNk0HVk5zI2B4+X0TSbgwa6klleKZHN/H5IpK249CJepIT\nGSWpMwwa6klOZJSkzjBoqGc5kVGSsuccDfUsJzJKUvYMGup5TmSUpOwYNCRJHVWr1VheXm6qF7GV\nc9WdnKMhqePc9r03tbIbrzv3lodBQ1LH+ObR21rZjdede8vDoCGpY3zz6F0XduNdW3sL9d14H0l9\nN94bmJ09taF3q5Vz1f0MGpI6wjeP3tbMbrx7OVfdz6AhqSN88+htrezG68695WLQkNQRvnn0tlZ2\n43Xn3nIxaEgtcLXE3vnmoVZ243Xn3vJwHw2pCSsrKxw9OtV4tHzd+Pgk1eo0/f39OVZWLNXqNJXK\ncWZnp+46NjY26ZtHj2hlN153
7i2PSCnlXUPbRcQIMD8/P8/IyEje5agEJiaOMDd3pjGR8TBwmr6+\nE4yNHWJm5sa8yysc3zyk7rWwsMDo6CjAaEppYb/Xs0dD2sWF1RL17v5jjaPHWFtLzM5Osbi46Jtl\ni9z2XeodztGQduFqCUnaO4OGtAtXS0jS3hk0pF24WkKS9s6gITXBpXb5cUmxVGxOBpWa4FK7znNJ\nsVQO9mhILRgcHOSaa64xZHSAD2CTysEeDUldxyXFUnnYoyGp67ikWCoPg4akruOSYu2Vk4e7TyGD\nRkS8NCLOR8Sb8q5FUvu5pFitWllZYWLiCMPDw0xOTjI0NMTExBFWV1fzLq3nFS5oRMQvAy8Avph3\nLZKy45JitcLJw92rUJNBI+LnqP8WPR/4vZzLkZQhlxSrWU4e7m5F69H4Y+BjKaVP5V2IpM5wSbF2\n4+Th7laYoBERzwEeC7ws71okSd3DycPdrRBBIyIeAbwZOJZS+lne9UiSuoeTh7tbpJTyrmFXEfFr\nwIeBNSAah/uA1Dh277SuIRExAswfPnyYAwcObLhWpVKhUql0pG5JUmesrq5SqRx3y/oWVatVqtXq\nhmPnzp3j9OnTAKMppYX9/oyiBI1LgX++6fC7gVuB16WUbt10/ggwPz8/z8jISGeKlCTlzsnD+7ew\nsMDo6Ci0KWgUYtVJSukO4Kvrj0XEHcAPN4cMSVLvGhwcNGB0mUIEjW10f1eMJBVQrVZjeXnZXgG1\nRWGDRkrp6XnXIEllsrKywtGjU85zUFsVYtWJJCl77q6pLBS2R0OS1D7urqms2KMhSXJ3TWXGoCFJ\ncndNZcagIUlyd01lxqAhSQKgWp1mbOwQMAVcBkwxNnaIanU658pUZE4GlSQB0N/fz8zMjS3vrum+\nG9qJQUOStEGzu2u674aa4dCJJGlP3HdDzbBHQ5LUMvfdULPs0ZAktcx9N9Qsg4YkqWXuu6FmGTQk\nSS1z3w01y6AhSdoT991QM5wMKknak73uu6HeYtCQJO1Ls/tuqDcZNCT1NHe1lLLlHA1JPWllZYWJ\niSMMDw8zOTnJ0NAQExNHWF1dzbs0qVQMGpJ6krtaSp3h0ImknuOullLn2KMhqee4q6XUOQYNST3H\nXS2lzjFoSOo57mopdY5BQ1JPcldLqTOcDCqpJ7mrpdQZBg1JPc1dLaVsOXQiSZIyY9CQJEmZMWhI\nkqTMGDQkSVJmDBqSJCkzBg1JkpQZg4YkScqMQUOSJGXGoCFJkjJj0JAkSZkxaEiSpMwYNCRJUmYM\nGpIkKTMGDUmSlBmDhiRJykwhgkZEvCwibomIH0XE9yLif0fEUN51dYNqtZp3CR3TK221neViO8ul\nV9rZToUIGsBVwH8HngiMAZcAH4+I++ZaVRfopV/6Xmmr7SwX21kuvdLOdrpn3gU0I6U0uf7riPhN\n4PvAKPAXedQkSZJ2V5Qejc0eACRgJe9CJEnS9goXNCIigDcDf5FS+mre9UiSpO0VYuhkk7cB/wJ4\n8g7n3Afg1ltv7UhBeTp37hwLCwt5l9ERvdJW21kutrNceqGd694779OO60VKqR3X6YiIeCvwTOCq\nlNLZHc47CvxZxwqTJKl8jqWU3r/fixQmaDRCxq8BV6eU/naXcx8EjAPfBH6SfXWSJJXGfYBHAbMp\npR/u92KFCBoR8TagAlwL1Na9dC6lZJCQJKlLFSVonKe+ymSzf5tSem+n65EkSc0pRNCQJEnFVLjl\nrZIkqTgMGpIkKTOlDxoR8c2IOL/uYy0iXpx3Xe0QEf8hIr4REXdGxJmI+OW8a2qniHjlpnt3PiIK\nv0lbRFwVER+NiO802nTtFue8JiK+GxE/johPRMRAHrXux27tjIh3bXF/T+VV7141+9DHot/TZtpZ\nhnsaEb8dEV+MiHONj89GxMSmcwp9L2H3drbzXpY+aFCfRPoK4KHAw4CHU39AW6FFxG8AbwReCT
wO\n+CIwGxEPzrWw9vsyd9+7hwFPybectrgU+ALwQraY5BwRLwFeBLwAeAJwB/V7e69OFtkGO7az4SY2\n3t9KZ0prq10f+liSe9rswy2Lfk9vA14CjFB/ntangI9ExBVQmnsJu7SzoT33MqVU6g/gG8CJvOvI\noF1ngBvWfR3At4EX511bG9v4SmAh7zoybuN54NpNx74LnFz39f2BO4Ffz7veNrfzXcCH864tg7Y+\nuNHep5T8nm7VzrLe0x9SX+VYynu5TTvbdi97oUcD4KUR8YOIWIiI34mIvrwL2o+IuIR6Av3khWOp\n/psxBzwpr7oyMtjoel+OiOmIeGTeBWUpIh5N/S+H9ff2R8DnKd+9BXhqoxv+axHxtoh4YN4FtcGG\nhz6W+J5u93DL0tzTiLhHRDwHuB/w2bLey83tXPdSW+5lEZ910qobgAXq/zP8CvA66r8ov5NnUfv0\nYKAP+N6m498DhjtfTmbOAL8JfJ36kNergNMR8UsppTtyrCtLD6P+j/dW9/ZhnS8nUzcBH6Le63gQ\n+EPgVEQ8qRGcCydiy4c+lu6ebtNOKMk9jYhfAj5HfYfM/wc8O6X09Yh4EiW6l9u1s/Fy2+5lIYNG\nRPwh9bGl7STgipRSLaX05nXHvxwRPwXeEREvSyn9LNNCtS8ppdl1X345Im4BvgX8OvVuPRVYSukD\n6778SkR8CVgGngp8Opei9q+Zhz6WwZbtLNE9/RpwJXAA+NfAeyPicL4lZWLLdqaUvtbOe1nUoZP/\nBvziDh9XANs9D+UW6gHrUZlXmZ0fAGvUJ+ms91Dg9s6X0xkppXPUt6Av3AzvFtxOfb5NT91bgJTS\nN6j/bhfy/kb9eUyTwFNTSn+37qVS3dMd2nmRot7TlNI/pZT+NqX01ymll1OfbH8dJbuXO7Rzq3P3\nfC8LGTRSSj9s9Fbs9PFP23z746hPYPp+B0tuq0ZPzDzwjAvHGl2Zz2Dj+FqpRMTPUf8l3/EftyJr\n/M98Oxvv7f2pz/Qv7b0FiIhHAA+igPc37n7o49PSpidLl+me7tTObc4v7D3d5B7Avct0L7dxD+De\nW72wn3tZyKGTZkXEIeq/AJ+mPv70K8CbgPc1/jousjcB746Ieeq9NCepT+R5d55FtVNEvAH4GPXh\nkl8AXg38DKjmWdd+RcSl1ANTNA5dHhFXAisppduoj32/IiKWqD+B+LXUVxR9JIdy92yndjY+Xkl9\nDPj2xnl/RL3Havbiq3Wv2PjQxzsi4sJfu+sf+lj4e7pbOxv3u/D3NCL+gPr8hLPAzwPHgKuBX22c\nUvh7CTuI2aKbAAAByklEQVS3s+33Mu/lNFl+UO+9+Bz1f9TuoL4nw4uBS/KurU3teyH1X/Q7G+18\nfN41tbl9Ver/A9/Z+J/h/cCj866rDe26mnqv2tqmj3euO+dV1JfR/bjxP/ZA3nW3s53UJ5/NNP4R\n+wn1oc63Aw/Ju+49tHOrNq4Bz910XqHv6W7tLMs9Bf60UfudjbZ8HHh6me7lbu1s9730oWqSJCkz\nhZyjIUmSisGgIUmSMmPQkCRJmTFoSJKkzBg0JElSZgwakiQpMwYNSZKUGYOGJEnKjEFDkiRlptTP\nOpGUn8Zjtd9BfYvjuw43Pk4DTwDutfnbgEuBx6T6wwMlFZxBQ1JW7gtUU0qvWX8wIi6j/oCm8yml\nkc3fFBGf4u4HsUkqOIdOJHXabiHCkCGViEFDkiRlxqAhSZIyY9CQJEmZMWhIkqTMGDQkSVJmDBqS\nJCkzBg1JkpQZg4akbpPyLkBS+xg0JGVpL5tvuWGXVCJuQS4pK+eAIxFxZN2xoN5jMQsciIhbNn3P\nhdfPd6ZESVmLlOyllCRJ2XDoRJIkZcagIUmSMmPQkCRJmTFoSJKkzBg0JElSZgwakiQpMwYNSZKU\nGYOGJEnKjEFDkiRl5v8DSlm+pgDImzUAAAAASUVORK5CYI
I=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x86e5630>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.scatter(local_dailyDelay.day.values.astype('i8'),\n",
    "            local_dailyDelay.avg_dep_delay.astype('f8'))\n",
    "plt.rcParams['font.family']=\"SimHei\" \n",
    "\n",
    "plt.xlabel('日期')\n",
    "plt.ylabel('起飞延误时间')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.lines.Line2D at 0xa14d438>"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAhoAAAF1CAYAAACu80M0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJzt3X18XVd54Pvfg5IQAolrSKAUEkhiWUkI6UViIANtCFwx\nchReShnayo5JO6W0pOB8DHNhLm2nLW3aoReaS5p2oISXUpcDzMDlpYmt1gMlDIMHRqINTANHVsgL\nkJIQGZMACbb83D/2USIpejmSztY5++j3/Xz08Tlr73POs7Ql+/Faaz8rMhNJkqQyPKLdAUiSpO5l\noiFJkkpjoiFJkkpjoiFJkkpjoiFJkkpjoiFJkkpjoiFJkkpjoiFJkkpjoiFJkkrT0YlGRJwaEbdE\nxBmz2l4aEZMRcSQixiOir50xSpKkxXVsohERpwKfAp4yq+0s4L3AG4GfAiaA69oSoCRJWlbHJhpA\nDfibeW3nAm/KzI9m5t3Afwaese6RSZKkphzX7gCW8KrMvC0irplpyMzr551zDsWohiRJ6kAdO6KR\nmbctdTwijgdeTzGqIUmSOlAnj2gs5y3AfcB7ljopIh4HDAG3AveXH5YkSV3jROCpwGhm3rOaN6hk\nohERLwBeAzw7M6eXOX2Ih6/1kCRJzdsBfHA1L6xcohERZ1J09orM/HoTL7kVYM+ePZx77rllhtZ2\nu3fv5uqrr253GKWzn93FfnYX+9ldbr75Zi677DJo/Fu6GpVKNCLiROBvgY8Dn4iIRwNk5g+WeNn9\nAOeeey79/f3lB9lGmzZt6vo+gv3sNvazu9jPrrXqpQcduxh0lpz1+N9Q3Gnya8D3gXuB788u6CVJ\nkjpHx49oZGbPrMefBHqWOF2SJHWQKoxoSJKkijLR6CIjIyPtDmFd2M/uYj+7i/3UfJGZy59VYRHR\nD4yNjY1ttIU7kiStyfj4OAMDAwADmTm+mvdwREOSJJXGREOSJJXGREOSJJXGREOSJJXGREOSJJXG\nREOSJJXGREOSJJXGREOSJJXGREOSJJXGREOSJJXGREOSJJXGREOSJJXGREOSJJXGREOSJJXGREOS\nJJXGREOSJJXGREOSJJXGREOSJJXGREOSJJXGREOSJJXGREOSJJWmoxONiDg1Im6JiDNmtZ0fEV+M\niHsi4q3tjE+SJC2tYxONiDgV+BTwlFltJwCfBL4EPBM4LyIub0+EkiRpOR2baAA14G/mtQ0DpwBv\nyMxvAL8FvGq9A5MkSc3p5ETjVZl5LRCz2i4ADmTm/QCZeRNwXjuCkyRJy+vYRCMzb1ug+RTgG/Pa\njkbEpnUISZIkrdBx7Q5ghY4u0PYAcBJweKkX7t69m02b5uYjIyMjjIyMtC46SZIqqlarUavV5rQd\nPrzkP61Nicxc85uUKSKOAU/NzNsj4o3A0zLz8lnHDwFbMvOeRV7fD4yNjY3R39+/PkFLktQFxsfH\nGRgYABjIzPHVvEfHTp0s4kvAc2aeRMSZwAnAVNsikiRJi6paonEjcPKsW1rfDOzPTh+WkSRpg6rC\nGo0Hk4jMnI6IXwNqEfE2YBq4uF2BSZKkpXV8opGZPfOefyoizgIGKG51PdSeyCRJ0nI6PtFYSGbe\nBextdxySpI2nXq8zOTnJli1b6O3tbXc4Ha9qazQkSWqLqakptm27lL6+PoaHh9m6dSvbtl3KoUMO\nrC/FREOSpCZs376T/fsPAHuA24E97N9/gJGRy9ocWWer5NSJJEnrqV6vMzp6A0WSsaPRuoPp6WR0\ndCcTExNOoyzCEQ1JkpYxOTnZeHTRvCPPA+DgwYPrGk+VmGhIkrSMs88+u/HoxnlHPgvAli1b1jWe\nKjHRkCRpGVu3bmVoaJienl0U0yd3AHvo6bmSoaFhp02WYKIhSVITarU9DA5eCOwEzgB2Mjh4IbXa\nnjZH1tlcDCpJUhM2b97Mvn3XMzExwc
GDB62j0SQTDUmSVqC3t9cEYwWcOpEkSaUx0ZAkSaUx0ZAk\nSaUx0ZAkSaUx0ZAkSaUx0ZAkSaUx0ZAkSaUx0ZAkSaUx0ZAkSaUx0ZAkSaUx0ZAkSaUx0ZAkSaUx\n0ZAkSaWpZKIREa+KiNsj4gcR8emIOLPdMUmSpIerXKIREWcBvwO8GOgDbgHe386YJEnSwiqXaADP\nAL6Qmf+Umd8E3guc3eaYJEnSAqqYaPwz8IKI+OmI2ARcAfxdm2OSJEkLOK7dAaxUZt4cER8Fvgwk\n8A3g2e2NSpIkLaRyIxoR8SzgRcCzgJ8APgTsbWtQkiRpQZUb0QB+CfhQZv6vxvPfjojXRMQFmXnT\nYi/avXs3mzZtmtM2MjLCyMhIiaFKklQNtVqNWq02p+3w4cNrft/IzDW/yXqKiP8X2JyZlzeenwJ8\nB3hOZn55gfP7gbGxsTH6+/vXN1hJkipsfHycgYEBgIHMHF/Ne1RxRONzwF9FxJcpEoxfA+4EFh3N\nkCRJ7VG5RCMzPxoR5wBXAk8EvgL8XGZOtzcySZI0X+USDYDMvAq4qt1xSJKkpVXurhNJklQdJhqS\nJKk0JhqSJKk0JhqSJKk0JhqSJKk0JhqSJKk0JhqSJKk0JhqSJKk0JhqSJKk0JhqSJKk0JhqSJKk0\nJhqSJKk0JhqSJKk0JhqSJKk0JhqSJKk0JhqSJKk0JhqSJKk0JhqSJKk0x7U7AEmS2q1erzM5OcmW\nLVvo7e1tdzhdxRENSdKGNTU1xbZtl9LX18fw8DBbt25l27ZLOXToULtD6xomGpKkDWv79p3s338A\n2APcDuxh//4DjIxc1ubIuodTJ5KkDalerzM6egNFkrGj0bqD6elkdHQnExMTTqO0gCMakqQNaXJy\nsvHoonlHngfAwYMH1zWeblX5RCMi3hoRn2h3HJKkajn77LMbj26cd+SzAGzZsmVd4+lWlZ46iYgL\ngN8ALmh3LJKkatm6dStDQ8Ps37+L6emkGMn4LD09VzI4OOy0SYtUdkQjIgJ4F/CnmXlbu+ORJFVP\nrbaHwcELgZ3AGcBOBgcvpFbb0+bIukeVRzReA5wPvCsiXgzsy8wjbY5JklQhmzdvZt++65mYmODg\nwYPW0ShBJRONiHg08HvALcBTgFcCvx0RF2XmA+2MTZJUPb29vSYYJalkogG8HDgJuDgzD0VED/AV\nirGv69oamSRJelBVE40nAQcy8xBAZk5HxE3AokuEd+/ezaZNm+a0jYyMMDIyUmqg0nqwfLLUuary\n+1mr1ajVanPaDh8+vOb3jcxc85ust4jYCfxGZj53VtsXgA9m5p/NO7cfGBsbG6O/v3+dI5XKNTU1\nxfbtOxtFhwpDQ8PUanvYvHlzGyOT1A2/n+Pj4wwMDAAMZOb4at6jqnedXA+cFxGvjognRcQuiltc\nP9bmuKR1ZflkqXP5+1mo5NRJZk5FxDDwduBPgTuBV2Tmt9obmaA6w4RVZ/lkqXP5+/mQqo5okJlf\nyMznZOZjMrM3M29Y/lUqk7sgri/LJ0udy9/Ph1Q20VDncZhwfVk+Wepc/n4+xERDLTEzTDg9fQ3F\nMOHpFMOE72B09AYmJibaHGG11Ot19u7du+T3baZ8ck/PLork7g5gDz09VzI0ZPlkqZ38/XyIiYZa\nwmHC1ljp9JPlk6XO5e9noZKLQdV55g4T7ph1ZOMNE67F3Omni4Ab2b9/FyMjl7Fv3/UPO9/yyVLn\n8vezYKKhlnAXxLVbyyp1yydLnWuj/346daKWcZhwbZx+ktSNHNFQyzhMuDZOP0nqRiYaarmNPky4\nWk4/SepGTp1IHcTpJ0ndpqkRjYi4C7gfOLrMqT8E/joz37rWwKSNyOknSd2m2amTE4FLmjjv8cBH\nIuLqzPzx6sOSNjannyR1i2YTjaOZOQYQEY/MzAfmnzDTHhHXAI8CTDQkSdrgVrNG42sRMT3v6xjw\nzw
CZ+VuZebi1YUqSpCpa7WLQk4BHUxRvP6nx5cLSLtXMvhuSJC1k2eQgInqA42c1ZWY+kJn3A8dm\nPc6yglR7uO27JGmtmhmFOA74dtmBqPO47bskaa2WXQzaWPjZFxEnNO4kiYh4b+PwaY3HUWaQWn9r\n2XdDkqQZyyYaEbEZeA8wBbwKeAvF7a7HgP85cxpF3WR1iWb23TDRkCQtp5nbW2sUUyz/ESAz31dq\nROoInbrvRr1eZ3Jy0kJWqhR/brWRNZNovDIz7wKIiM9RVAddbOHnccCJmfmsFsWnNum0fTempqbY\nvn1nYzqnMDQ0TK22h82bN69rLFKz/LmVmlujcdesp++hKDM+vcjpJwGPaUFc6gC12h5GRi5jdHTn\ng22Dg8Nt2Xdj7sLUi4Ab2b9/FyMjl7Fv3/XrHo/UDH9uJYjM7r4rNSL6gbGxsTH6+/vbHU4ltXvf\njXq9Tl9fH3MXptJ4vpN6ve5wtDqOP7fqBuPj4wwMDAAMZOb4at5jRUW2IuK1ETG0mg9SdfX29nLJ\nJZe07S/FZhamSp3Gn1up0OzuredR7F1yBvCjiDhjkVMT+FZmHmtRfFLHLkyVluLPrVRodlO1rwJH\nGo+nWXyNxgnArRFxTq7TnExE7AVqmfmB9fg8rb9OW5gqNcOfW6nQ1NRJZj4iMx9JcYvrL2TmyQt9\nAacATwDOKzHmB0XEDsCpnA2gVtvD4OCFwE6KgbWdDA5e2JaFqVKz/LmVmh/RWFBEnAk8ITMPQFFF\nNCLeDNzWiuCW+ezNwNuAr5X9WWq/zZs3s2/f9W1fmCqthD+30soTjUdQlCD/SeAq4DLgfcCBmRMy\n8y9aF96S3g58DHjUOn2eOkBvb69/Uaty/LnVRrbSRGNmJ9djjT/Pz8x13zs8Ip4PvAB4GnDten++\nJElqzkoTjc8D32kU8XplCfEsKyIeCbwT+I3M/EGE+7lJktSpmk40IuJuiltcj0bEMR4qQx6N9zke\neHZmlr0+4z8CX8zMfSt50e7du9m0adOctpGREUZGRloZmyRJlVSr1ajVanPaDh8+vOb3bboyaER8\nH3g+RWLx6cZjGs//HhgEvtLYSr40EXELcCoP3WJ7EsWtt+/PzNcucL6VQSVJWoVWVAZdydTJ0cwc\nA4iIBx83nh+Z/bxkP8PcuN8OfAF4/zp9viRJatJKEo1HR8QHFngMcHLj+fcyc1frwnu4zPz27OcR\ncS/w3cycKvNzJS3P7dAlzbeSROMPKKYojgJfprjz5BEUd6L878Z7/ajVAS4nM//den+mpLncDl3S\nYppONDLzD8sMRFJ1uR26pMWsaPfWGVHoazz+4qz2MyPiLa0KTlLr1et19u7dy8REa0rg1Ot1Rkdv\nYHr6GorNw04HdjA9/Q5GR29o2edIqqamEo2I+MWIeGnjaxvF5mkfbxw+c9ap/xZ4YYtjlNQCU1NT\nbNt2KX19fQwPD7N161a2bbuUQ4cOrel93Q5d0lKaHdF4N8XmZduAP6GopzGzm+sPACLiScC/B97c\n4hgltcDc6Y3bgT3s33+AkZHL1vS+c7dDn83t0CU1v0bjnsy8AiAibsrMjIijjWMZERcAHwbemZmf\nKSNQdSfvUlgfM9MbRZKxo9G6g+npZHR0JxMTE6v+/rsduqSlNDuisVBVr9Mj4k8pimd9CviDzPzd\nlkWmrlbWMP5srV6LUGVlT2+4HbqkxTSbaCy0ociPgQcaxzYBZ4cbj6hJZQ3jw/okMVVT9vTGzHbo\n9XqdG264gXq9zr5913trq6Q1jWjclZn/N3A38AzgJcCHWhWYulfZdymUmcRU1cz0Rk/PLorvyx3A\nHnp6rmRoqHXTG729vVxyySVOl0h6ULOJxlMi4tsRcSdARJxAsYkawHGZ+Q2KvU/Oj4grSohTXaTM\nYXxvtVyc0xuS2qHZxaAnU9xlkrNe89eNP0+JiFOAxwOvA/4mIq4r
e3M1VdfcYfwds46sfRi/mSRm\no/5ve2Z6Y2JigoMHD7oAV9K6WEnBrt8Cjs/M+zPzx5n5VoDM3AT8PMUOrrcAV5pkaCllDuN7q+Xy\nnN5oDRcbS81ZNtGIiJOAUYo6GidFxDkRcVdjKuXbEfFt4I+BJwMHgC8u8XYSUN4w/nqtRdDG5WJj\naWWamTp5JsWKul/NzPsj4icpNlH75XnnHQ9cmJm3tjRCdaUyh/FrtT2MjFzG6OjOB9sGB4ddi6CW\ncF8XaWWWTTQy80bmjkM/AHw+M29b4HRrDWtFent7Wz7K4FoElaXMwmdSt1rJNvEAZOZB4LcXOhYR\nkZkL3QorrbsykhhtbC42llZuVbu3ztdYxwFwQUTsb8V7SlKncbGxtHLN7t66MyIunfX8v0TEKxuP\njwdmVkHdRVFPQ5K6jouNpZVrdkTjl4F/Nev5KRRbxZOZR4D7G+3fhWIKpUXxSVJHsfCZtDLNrtH4\nIPCGiDjWeH4W8KKI+CmKvU6OQJF0RMQx12lI6lYuNpZWptlE4++APwdOazzf1/jztNknRcRxwA9b\nE5okda6qLjau1+tMTk6aIGndNJVoZOYdEXGYournsfnHI+IXGw+fTLFOQ5LUQaampti+fWfj9tzC\n0FBRX8ZddlWmldx1MgmcExH/OyK+GBH/EBGfjohPAz2Ncy4Evt7yKCVJa+KuxmqXldTR+DTFtMhb\nKRZ/Tjfae3go0XgZ8JmWRSdJWjMLjamdVjKicW1m3pqZH8jMj1DcdfKlzPxIZtYi4hzgEh7a1bU0\nEfHSiJiMiCMRMR4RfWV/ZjdxMyhpY2mm0JhUlpUkGt+IiHc3EgqAdwJ/HxH/HBG/D3wYeHNmlrpG\nIyLOAt4LvBH4KWACuK7Mz+wWbgYlbUwWGlM7rSTR+B7wNeD6iNgL3JmZfcBbgN8B+oBHtT7EhzkX\neFNmfjQz7wb+M/CMdfjcynOOVtqYLDSmdmq2MugJwNHMfDtFQvHfgW82RhfeDvw6xT/2l0fEx8os\n2JWZ12fm7BGMcyhGNbSEmTna6elrKOZoT6eYo30Ho6M3OI0idTkLjaldml0Megx4A0BmHgWuAq6K\niEcAz53ZGj4ingv80noV7GqUP3898Lb1+LwqczMobTTWi5jLQmNql2braBwFPrJA+zHg1llNx4B3\ntySy5rwFuA94zzp+ZiXNnaPdMeuIc7TqLtaLWFpVC42pula0TXxE/ArwL5m5d177q4Axij1QPg6U\n/tscES8AXgM8OzOnlzt/9+7dbNq0aU7byMgIIyMjJUXYWWbmaPfv38X0dFKMZHyWnp4rGRx0jlbd\nY+5apIuAG9m/fxcjI5exb9/1bY5O6ly1Wo1arTan7fDhw2t+31jJLEdEXAvUM/OaxvNHUJQm/1Vg\nF0UNjZuBExqjIKWIiDOBLwCvz8wPLnNuPzA2NjZGf39/WSFVwqFDhxgZucz/6alr1et1+vr6mFsv\ngsbzndTrdZNqaQXGx8cZGBgAGMjM8dW8x4pGNICjNAp1RcQpwH+lWBz6M5n5xYh4EpCs7G6WFYmI\nE4G/pRg5+UREPBogM39Q1md2C+do1e1ciyR1nmUTjYg4Fbg3Mx+gSCJm/C5wPNCfmfc02qbn/VmG\nf0Nxp8k5wK9R7B6bEXFmZt5e4ud2Dedo1a1ciyR1nmZGND4NPC0iZqZCDkXEEHAA2D4ryZh5vyPN\nrJlYrcz8JA+VPJekB7kWSeo8zUxxvBB4PPCTFHd3fJBi6uJCYDIifqexVgNgE3B3GYFKUjOsFyF1\nlmVHNDLzOzOPI+LHwGRm/iXwlxHxfwAfAp4XES8CzsLiWVJXqVo9CtciSZ1lpYtBj5/9msz8x4h4\nFsXdJu8Cvs/Di+lLqqCq16NwLZLUGZq+O6RRVvwTwOdnt2fm94EXAx8AXgH8ZSsDlNQe7o0jqRWa\nGtGIiH8NvB94SWZ+PSL+CJh/
F8rlwO9n5rdbHqWkdTWzN87cehQ7mJ5ORkd3MjEx4WiBpKYsO6IR\nEa8A/p5iIWi90fx64FTgtFlf9wNbywlT0npqph6FJDWjmRGNMWA4M2+EB6dQIjNfN/ukiHgCcGNE\nPJCZ/6H1oUrVVbUFldajkNQqzdx1cgtwy6zn2RjlmH/edyLihZRYFVSqmqouqFyvehRVS8AkrVyz\nazQ+CXyPogT5TNvPLXDqD4Aac3d0lTasKm/wVavtaeyNs/PBtsHB4ZbUo6hqAiZp5Zq9vfV/8vDF\nnwu5GHgH8K/WEJPUFaq+oLLMehRVTsAkrUxTiUZmXtXMeRExDnw8Ih6RmcfWFJlUcd2ywVer61FU\nPQGTtDIrqaNxd0TcGhG3zPu6NSLubJz2v4AnmWRI8xdUzraxF1R6R4u0saykMujzKdZoJPBEYCa5\nCOD4iDg5M+9tcXxSZbnB18K8o0XaWJoa0YiImzPzq5n5NeBlwMwk6o+A52XmV4D3RcS/LSlOqZLc\n4OvhZhKwnp5dFNMndwB76Om5kqGhjZuASd2q2RGNJzbqZ3waOBe4uVEh9BnAz1HsczKBBbukOdzg\na2Fl3tEiqbM0m2gca9TPuI4i2fjSTDvFqAbA1ymmVyTN4wZfc5mASRtHs4lGRMRVwLsz886IODUi\npoAe4MTG4+OBfy4rUKnTWGxq7UzApO7X7F0nJwD9wOcbG6x9H3gR8BrgfwAvpJiEPreMIKVOMjU1\nxbZtl9LX18fw8DBbt25l27ZLOXToULtDk6SO02yicSQzL6HYofWjwMkUZclvBu7NzDFgFHh0RJy9\n+NtI1ef26ZLUvGZ2b30kcCJAZu6nGLn4EfA4ittd72oc+xFFCfKBsoKV2m2m2NT09DUUt2aeTlFs\n6h2Mjt7AxMREmyOUpM7SzKZqD9BINBrP/1tEPC4zZ8qRv2rW6f2ZabUdda1uqfZZVa6LkapnVTut\nzkoy5rebZKirWe2zPVwXI1WXW7pLK2CxqfZwXYxUXSYa0gpZ7XN9uS5GqraV7HXSMSLifOC9wNnA\ndZn5pjaHpA3EYlPry3UxUrVVbkQjIk4APklRnfSZwHkRcXl7o1I3qNfr7N27t+n/Iff29nLJJZf4\nj1zJXBcjVVvlEg1gGDgFeENmfgP4Lebe+SKtiAsNO5vrYqRqq2KicQFwIDPvB8jMm4Dz2huSqsyF\nhp3PdTFSdVVxjcYpwDfmtR2NiE2ZeXixF919993ceeedCx477rjjOO2005b80LvvvpujR48uevwx\nj3kMJ5988qLHjxw5wne/+90lP+PUU0/l+OOPX/T4vffey3333bfocfvxkGb7MTk5yejoDcCfAS9o\nHH0B09O/z+jo65iYmFj0f8yd1I+lVOl6LOVTn/o4t95666LrYqrSj265HvajsJH6sVpVTDQWuloP\nACcBiyYar371qznxxBPntJ1//vk8/elP57TTTuM3f/M3l/zQj3zkI9x9992LHr/44ou5+OKLFz1+\n6NAh3vWudy35GVdccQWPf/zjFz0+NjbGP/zDPyx63H48pNl+PLQeYxKY3a/iR2mphYad1I+lVOl6\nLOWKK65YchO2KvWjW66H/eiufjz2sY+lVqvNaT98eNF/VpsWi9Te6lgR8UbgaZl5+ay2Q8CWzLxn\ngfP7gbF9+/ZxwQUXLPieGykjtR+F2SMaP/uzP0sxovHyWWd8FHgd9XrdEY0N0o96vc74+Dinn346\nZ5111oLnVKEf0NrrMTk5yW233cZTn/rUOd+XqvVjMfajsFg/xsfHGRgYABjIzPElg1hMZlbqC3g+\nMDHr+ZkUe6zEIuf3Azk2NpbSQoaGhrOn57EJf51we8JfZ0/PY3NoaLjdoWkd3HPPPTk0NJzAg19D\nQ8M5NTXV7tDayu+LMjPHxsZmrn9/rvLf7SouBr0ROHnWLa1vBvZnVmxoRh3DhYYbm4uBF+b3Ra
1S\nuakTgIh4MVCj2EV2Grg4M7+2yLn9wNjY2Bj9/f3rGKWqxgJcG0+9Xqevr4/iH9Mds47sAXYuOXXW\nzfy+aEYrpk6quBiUzPxURJxFsSX9gcy04IHWbKmFhupOVh1dmN8XtVIVp04AyMy7MnOvSYak1bLq\n6ML8vqiVKptoSNJaWXV0YX5f1EomGpI2NBcDL8zvi1qlkms0JKlV3I13YX5f1ComGpKEi4EX4/dF\na+XUiSRJKo0jGpLUYer1OpOTk05XqCs4oiFJHWJqaopt2y6lr6+P4eFhtm7dyrZtl3LokHfxq7pM\nNCSpQ1j2W93IqRNJ6gD1ep3R0RuYW/Z7B9PTyejoTiYmJhacRnGaRZ3OEQ1J6gDNlP2ezWkWVYWJ\nhiR1gJWW/XaaRVVhoiFJHWAlZb9nplmmp6+hmGY5nWKa5R2Mjt7AxMREW/ogLcREQ5I6RLNlv1c6\nzSK1k4tBJalDNFv2e+40y45ZR9xdVZ3HREOSOsxyZb9npln279/F9HRSjGR8lp6eKxkcdHdVdRan\nTiSpgtxdVVXhiIYkVVA37K5qDZCNwURDkiqsirurTk1NsX37zkaBssLQ0DC12h42b97cxshUBqdO\nJEnryhogG4sjGpKkdbPaUuuqLkc0JEnrxhogG4+JhiRp3ay01Lqqr+1TJxHxJmAbkEudBjwOeFFm\n3r4ugUmSWs4aIBtP2xMN4M7MfP5yJ0XES2Y9fjXwe8CpwOeBX8rM75QWoSSpZWq1PYyMXMbo6M4H\n2wYHh60B0qU6IdGIlZwXEc8Ffh/YDnwdqAFvo6haI0nqcN1QA0TN64REY6kpk/kC6AV+PTM/AxAR\n7wP+fRmBSZLKU8UaIFq5Tkg0ViIz8/3z2voA90SWJKkDVS3RmCMiHgv8OvBL7Y5FkiQ9XKUTDeDP\ngf+emX+33Im7d+9m06ZNc9pGRkYYGRkpKzZJkiqjVqtRq9XmtB0+fHjN7xuZK1ki0XoR8crM/EAT\n570U+PLM7a0RcTnwx8AFmfndJV7XD4yNjY3R39/fqrAlSep64+PjDAwMAAxk5vhq3qOSIxoR8Uzg\nHcCLl0oyJElSe1Up0UiAiDgN+CTwJ8B4RDwaIDN/0MbYJGnDc9t3LaQTSpCvqI4GMAI8AfgD4PvA\nvY0/JUltMDU1xbZtl9LX18fw8DBbt25l27ZLOXToULtDUwfohBGNkyPiMyxfgvwk4GWZeQ1wzbpE\nJkla1txt3y8CbmT//l2MjFzGvn3Xtzk6tVvbE43MvBa4tt1xSJJWzm3ftZxOmDqRJFWU275rOSYa\nkqRVc9sXmi0aAAAMu0lEQVR3LcdEQ5K0ajPbvvf07KKYPrkD2ENPz5UMDbntu0w0JElrVKvtYXDw\nQopNtM8AdjI4eKHbvgvogMWgkqRqc9t3LcVEQ5LUEm77roU4dSJJkkpjoiFJkkpjoiFJkkpjoiFJ\nkkpjoiFJkkpjoiFJkkpjoiFJkkpjHQ1JUtep1+tMTk5aPKwDOKIhSeoaU1NTbNt2KX19fQwPD7N1\n61a2bbuUQ4cOtTu0DctEQ5LUNbZv38n+/QcoNni7HdjD/v0HGBm5rM2RbVxOnUiSukK9Xmd09AaK\nJGNHo3UH09PJ6OhOJiYmnEZpA0c0JEldYXJysvHoonlHngfAwYMH1zUeFUw0JEld4eyzz248unHe\nkc8CsGXLlnWNRwUTDUlSV9i6dStDQ8P09OyimD65A9hDT8+VDA0NO23SJiYakqSuUavtYXDwQmAn\ncAawk8HBC6nV9rQ5so2r7YtBI+JNwDYglzoNeBzwosy8fV0CkyRVzubNm9m373omJiY4ePCgdTQ6\nQNsTDeDOzHz+cidFxEsWaDsOGAdem5nzJ+UkSRtUb2+vCUaH6ISpk1jDeW8CntbCWCRJUgt1QqKx\n1JTJoiKiF3gDcGtLo5EkSS3TCYnGar0T+GPgtnYHIkmSFl
bJRCMifgU4BXgbzU+9SJKkddYJi0FX\nJCJOBf4IeGFmZkRzecbu3bvZtGnTnLaRkRFGRkZaH6QkSRVTq9Wo1Wpz2g4fPrzm961SopEUoxfv\nAK7LzK+u5MVXX301/f39pQQmSVLVLfSf7/HxcQYGBtb0vlWcOhkBXhcRhyLiEPAzwN9GxBvbHJck\nqUT1ep29e/cyMTHR7lC0AlUa0YBiVOOp89o+DFwN7Fv3aCRJpZuammL79p2NnVkLQ0PD1Gp72Lx5\ncxsjUzM6YURjRXU0MvP22V/Aj4B/yczvlxahJKlttm/fyf79Byj2L7kd2MP+/QcYGbmszZGpGZ0w\nonFyRHyG5UuQnwS8bP6BzHxBWYFJktqrXq83RjL2ADsarTuYnk5GR3cyMTFhBdAO1/ZEIzOvBa5t\ndxySpM4zOTnZeHTRvCPPA+DgwYMmGh2uE6ZOJEla0Nlnn914NH87q88CsGXLlnWNRytnoiFJ6lhb\nt25laGiYnp5dFNMndwB76Om5kqGhYUczKsBEQ5LU0Wq1PQwOXgjsBM4AdjI4eCG12p42R6ZmtH2N\nhiRJS9m8eTP79l3PxMQEBw8eZMuWLY5kVIiJhiSpEnp7e00wKsipE0mSVBoTDUmSVBoTDUmSVBoT\nDUmSVBoTDUmSVBoTDUmSVBoTDUmSVBoTDUmSVBoTDUmSVBoTDUmSVBoTDUmSVBoTDUmSVBoTDUmS\nVBoTDUmSVBoTDUmSVBoTDUmSVBoTDUmSVJq2JxoR8aaI+ExEfHqJr89ExE0RcUa74+1ktVqt3SGs\nC/vZXexnd7Gfmq/tiQZwZ2Y+PzNfsMTX84Hfnv/CiPhwRLyjDTF3pI3yg28/u4v97C72U/Md1+4A\ngFjNeRExDFwEbG15RJIkqSU6YUQjV/qCiDgJ+HPgP2Tmva0PSZIktUInJBqr8XvA8cB0RAxGRLOj\nIpIkaR11wtTJijQWhO4CvgicBewGvgm8dJGXnAhw8803r0t87XT48GHGx8fbHUbp7Gd3sZ/dxX52\nl1n/dp642veIzBXPXLRURLwyMz/QxHkvBb4MXA78KtCbmUci4jHAbcAvZub+BV63HfibFoctSdJG\nsiMzP7iaF1ZpRCMpFoQ+GdifmUcAMvO+iJgAtgAPSzSAUWAHcCtw//qEKklSVzgReCrFv6WrUqVE\nY2YdxjeBcx5sLNZnPBn41kIvysx7gFVlYZIkif+xlhdXKdHIxtd/Ab4UES+jWKexi6IfC41mSJKk\nNuqERGNFdTQy82sRMQL8IdALHARekpk/Kik+SZK0Sp2wGPS1wMtZup5GACcBL8vMb897/ZuAbU28\n/nHAizLz9rVFLEnaaCJiE9AH1DPze+2Opyyl9DMzK/0FvLLJ814CnNF4/EngWONrGvi7dvdjjd+D\n8ymmke4B3trueErs5zWzrtmxxi9C2+NqUd9OBW6Z+Rnt1uu6SD+77rpS3G4/CRwBxoG+brymS/Sz\nq64p8ArgEPBPwL3Ay7v0ei7WzzVdz6oW7JptNSXMB4CnAT8BbGbxGhwdLyJOoEicvgQ8EzgvIi5v\nb1SlGQAuobhuPwE8o73htEZEnAp8CnjKrLauu64L9bOhq65rRJwFvBd4I/BTwARwXeOafoouuaaL\n9bNxuGuuaUScQlGJ+mcy86eB1wL/TxdezwX72Ti8tuvZ7gyqBRlYsyMaLwXOoPiF+Fa7425h/38O\n+C5wYuP5BcDn2h1XCf3sAb4HnNTuWEro2983fqmneWjUreuu6yL97LrrClwKvGrW84uB+xp/B3XN\nNV2in111TSnuahyZ9fzpwOEuvJ6L9XPN17MbRjRW6lnAcRFxR0TcFxG1xpxUVV0AHMjM+wEy8ybg\nvPaGVIqnU5TM/6eI+GFE7I2I09sdVIu8KjOvZe6oWzde14X62XXXNTOvz8zrZjX1Ufxv/6fpomu6\nQD/PoehnV13TzPxmZt
YAIuJ4imrU/x/ddz0X6ufHaMH13GiJRlD8MvwjxTDQs4EzgT9uZ1BrdArw\njXltRyuePC3kPOBrFMXXng4cBf6yrRG1SGbetkBz113XRfrZtdcVHvwL+w3AO+nCazqj0c/XU/Sz\nK69pRFwA3AkMUZRV6MrrOa+fV9KC69kJt7eup8zM/wT8p5mGiPi/gI8CV7QtqrU5ukDbAxR36Rxe\n51hKk0Xp2wcLr0XEFcA3IuIxmXlf+yIrjde1O67rWyimE64DrlrgeLdc0wf7mZnTdOE1zcybIuKF\nwNXAeyhKK8xX+es5v5+Z+QrWeD032ojGQu4CHtfIyKtoCjhtXtvJwI/bEMt6uovi5/eJ7Q6kJF7X\niouIFwCvoZj3nqZLr+kC/Zyva65pZn4Z+GXg5+nS6wlz+9lYJDrbiq/nRko0kqJi+Yci4rmz2p8D\nfCcbe6dU0Jco+gBARJwJnEDxS9A1IuJPGoXaZjyHYlHhHW0KqWxe1wprXK8PAldk5tcbzV13TRfq\nZ7dd04i4KCL+ZFbTEYpbPG+mi67nIv1M4HfXej032tQJwFeAqyNiN0U2+kcUt/RU1Y3AyRFxeWb+\nFfBmik3n2luJrfX+CfjDiPgOxc/tNcBfzSzE6kJe14qKiBOBvwU+DnwiIh7dOPQ5uuiaLtHPm+iu\na1oHXh0RdWAfxRTYKLAXeE+3XE8e3s8/pOjnGGu9nu2+paYFt+Rc3uR5M7e3HkcxX/p9io3Yfgt4\nRLv7scbvwYsp5kfvBv4FOKfdMZXUz6soisncDfwp8Kh2x9Ti/j1422c3X9cF+tlV15WiOOD0rK+Z\nQkdndNM1Xaaf3XZN/0/gqxS3eX4IeFyjvWuu5wL9/PCsfq7pera9BPlarbWEebeIiMdTFFU5kJmH\n2h2PWsPr2n28pt3F67m8yicakiSpc22kxaCSJGmdmWhIkqTSmGhIkqTSmGhIkqTSmGhIkqTSmGhI\nkqTSmGhIkqTSmGhIarmIeHFEHIuIHy/wdSQipiPi3bPOf2JEHJn1/D0RcVXj8URE3BIRX4mIOyLi\nv7WjT5JWZyPudSKpfNPArZl51kIHI+J9wNHG4/OBjxUP46bGKU8EjkXEbcD9wK7M/ExEXA78QunR\nS2oZRzQklWGh7cLnOwqQmV8F/jXww8y8IDMvoNhP4i+A91DsoRGN10ST7y2pQziiIakMzextMDth\nOAqcFBFfpEgmTgf+IjOnIyKBP4+I+4DHUWz6JKkiHNGQVIZmEo355/wQuLDx9V/nnfda4NnAH7Qk\nOknrxhENSWVI4IyIuGuBYwE8GnjnvPaTgH9sHP9J4M8a7T1AZuaxiDhWUrySSmKiIakMCdy+zGLQ\n+X7YWJ9BRPzZrPYTgF+IiGcAz2x5pJJKZaIhqQyx/CnFORFxPHD8Asd7IuKRwMnAz1Os43gU8LlW\nBSmpfCYaksrQdKIBXAlcDhyMiC/POvZI4DDwBOCMzPx24/bWl7c6WEnlcTGopDI083dLD0Bmvg24\niOJukucDg8AXgAuACeDuzPx2SXFKKpkjGpLK0MPyi0H/albbtcAPMvN7UFQKBa4DnszcO1AkVYyJ\nhqQy9LD0YtD30vj7JyJ+HngKcPGsU/4dcBXFra6/0jhvAPhl4PaygpbUepHZzO3uklSeiDg+M4/M\na+sFLsjMjzaePwl4A/D2zPxWG8KUtAomGpIkqTQuBpUkSaUx0ZAkSaUx0ZAkSaUx0ZAkSaUx0ZAk\nSaUx0ZAkSaUx0ZAkSaUx0ZAkSaX5/wGQzjAfAKgffAAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0xa14d550>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.scatter(local_dailyDelay.day.values.astype('i8'),\n",
    "            local_dailyDelay.avg_arr_delay.values.astype('f8'))\n",
    "\n",
    "plt.xlabel('日期')\n",
    "plt.ylabel('到达延误时间')\n",
    "\n",
    "plt.axhline(0, color='black', linestyle='--', alpha=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Stop the current Spark session (mySpark); the machine-learning section\n",
    "# below creates a new one.\n",
    "mySpark.stop()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 45.7 Spark机器学习"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "mySpark = SparkSession.builder\\\n",
    ".appName('My_LR')\\\n",
    ".master('local')\\\n",
    ".getOrCreate()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[_c0: int, height: int, weight: int]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "myDF=mySpark.read.format(\"csv\")\\\n",
    ".option(\"inferSchema\", \"true\")\\\n",
    ".option(\"header\", \"true\")\\\n",
    ".load(\"women.csv\")\n",
    "myDF\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(_c0=1, height=58, weight=115),\n",
       " Row(_c0=2, height=59, weight=117),\n",
       " Row(_c0=3, height=60, weight=120)]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first three rows (returned as a list of Row objects).\n",
    "myDF.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _c0: integer (nullable = true)\n",
      " |-- height: integer (nullable = true)\n",
      " |-- weight: integer (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the column names, types and nullability inferred for myDF.\n",
    "myDF.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>summary</th>\n",
       "      <td>count</td>\n",
       "      <td>mean</td>\n",
       "      <td>stddev</td>\n",
       "      <td>min</td>\n",
       "      <td>max</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>_c0</th>\n",
       "      <td>15</td>\n",
       "      <td>8.0</td>\n",
       "      <td>4.47213595499958</td>\n",
       "      <td>1</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>height</th>\n",
       "      <td>15</td>\n",
       "      <td>65.0</td>\n",
       "      <td>4.47213595499958</td>\n",
       "      <td>58</td>\n",
       "      <td>72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weight</th>\n",
       "      <td>15</td>\n",
       "      <td>136.73333333333332</td>\n",
       "      <td>15.498694261437752</td>\n",
       "      <td>115</td>\n",
       "      <td>164</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             0                   1                   2    3    4\n",
       "summary  count                mean              stddev  min  max\n",
       "_c0         15                 8.0    4.47213595499958    1   15\n",
       "height      15                65.0    4.47213595499958   58   72\n",
       "weight      15  136.73333333333332  15.498694261437752  115  164"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics (count, mean, stddev, min, max), converted to pandas and\n",
    "# transposed so that each row corresponds to one column of myDF.\n",
    "myDF.describe().toPandas().transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(_c0=1, height=58, weight=115),\n",
       " Row(_c0=2, height=59, weight=117),\n",
       " Row(_c0=3, height=60, weight=120)]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# take(3) returns the first three rows as a list of Row objects.\n",
    "myDF.take(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(_c0=1, height=58, weight=115, features=DenseVector([58.0])),\n",
       " Row(_c0=2, height=59, weight=117, features=DenseVector([59.0])),\n",
       " Row(_c0=3, height=60, weight=120, features=DenseVector([60.0]))]"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.ml.feature import VectorAssembler\n",
    "# Pack the single predictor column 'height' into a vector column named\n",
    "# 'features' (the column later passed as featuresCol to the regression).\n",
    "vectorAssembler = VectorAssembler(inputCols = ['height'], outputCol = 'features')\n",
    "v_myDF = vectorAssembler.transform(myDF)\n",
    "v_myDF.take(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(features=DenseVector([58.0]), weight=115),\n",
       " Row(features=DenseVector([59.0]), weight=117),\n",
       " Row(features=DenseVector([60.0]), weight=120)]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the feature vector and the 'weight' column used as the label.\n",
    "v_myDF = v_myDF.select(['features', 'weight'])\n",
    "v_myDF.take(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# NOTE(review): training and test sets are the same DataFrame here, so any\n",
    "# evaluation on test_df is in-sample rather than on held-out data.\n",
    "train_df=v_myDF\n",
    "test_df=v_myDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.regression import LinearRegression\n",
    "# Estimator with default settings, predicting 'weight' from 'features'.\n",
    "myModel = LinearRegression(featuresCol = 'features', labelCol='weight')\n",
    "# myResults is the model fitted by the default estimator above.\n",
    "myResults = myModel.fit(train_df)\n",
    "# NOTE(review): myModel is rebound to a regularised estimator (regParam,\n",
    "# elasticNetParam, maxIter) only after the fit, so these settings do not\n",
    "# affect myResults; this estimator is not fitted in this cell -- confirm\n",
    "# whether that is intended.\n",
    "myModel = LinearRegression(featuresCol = 'features', labelCol='weight', maxIter=10, regParam=0.3, elasticNetParam=0.8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Coefficients: [3.45]\n",
      "Intercept: -87.51666666666614\n"
     ]
    }
   ],
   "source": [
    "print(\"Coefficients: \" + str(myResults.coefficients))\n",
    "print(\"Intercept: \" + str(myResults.intercept))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Summary object of the fitted model; the following cells read metrics from it.\n",
    "summary = myResults.summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'NoneType' object has no attribute 'setCallSite'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-56-8cdc2f1775aa>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msummary\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresiduals\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m15\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32mC:\\Users\\Administrator\\Anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py\u001b[0m in \u001b[0;36mtake\u001b[0;34m(self, num)\u001b[0m\n\u001b[1;32m    569\u001b[0m         \u001b[1;33m[\u001b[0m\u001b[0mRow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34mu'Alice'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mRow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34mu'Bob'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    570\u001b[0m         \"\"\"\n\u001b[0;32m--> 571\u001b[0;31m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlimit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnum\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcollect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    572\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    573\u001b[0m     \u001b[1;33m@\u001b[0m\u001b[0msince\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1.3\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mC:\\Users\\Administrator\\Anaconda3\\lib\\site-packages\\pyspark\\sql\\dataframe.py\u001b[0m in \u001b[0;36mcollect\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    530\u001b[0m         \u001b[1;33m[\u001b[0m\u001b[0mRow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34mu'Alice'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mRow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34mu'Bob'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    531\u001b[0m         \"\"\"\n\u001b[0;32m--> 532\u001b[0;31m         \u001b[1;32mwith\u001b[0m \u001b[0mSCCallSiteSync\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sc\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mcss\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    533\u001b[0m             \u001b[0msock_info\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcollectToPython\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    534\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_load_from_socket\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msock_info\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mBatchedSerializer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mPickleSerializer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mC:\\Users\\Administrator\\Anaconda3\\lib\\site-packages\\pyspark\\traceback_utils.py\u001b[0m in \u001b[0;36m__enter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     70\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__enter__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m     71\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mSCCallSiteSync\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_spark_stack_depth\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_context\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jsc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msetCallSite\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_call_site\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     73\u001b[0m         \u001b[0mSCCallSiteSync\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_spark_stack_depth\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m     74\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'setCallSite'"
     ]
    }
   ],
   "source": [
    "summary.residuals.take(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9910098326857506"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary.r2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.419702629269787"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary.rootMeanSquaredError"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+------+------------------+\n",
      "|features|weight|        prediction|\n",
      "+--------+------+------------------+\n",
      "|  [58.0]|   115|112.58333333333334|\n",
      "|  [59.0]|   117|116.03333333333336|\n",
      "|  [60.0]|   120|119.48333333333335|\n",
      "|  [61.0]|   123|122.93333333333334|\n",
      "|  [62.0]|   126|126.38333333333333|\n",
      "|  [63.0]|   129|129.83333333333331|\n",
      "|  [64.0]|   132| 133.2833333333333|\n",
      "|  [65.0]|   135| 136.7333333333333|\n",
      "|  [66.0]|   139|140.18333333333328|\n",
      "|  [67.0]|   142|143.63333333333327|\n",
      "|  [68.0]|   146|147.08333333333326|\n",
      "|  [69.0]|   150|150.53333333333325|\n",
      "|  [70.0]|   154|153.98333333333326|\n",
      "|  [71.0]|   159|157.43333333333325|\n",
      "|  [72.0]|   164|160.88333333333324|\n",
      "+--------+------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predictions = myResults.transform(test_df)\n",
    "predictions.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+\n",
      "|        prediction|\n",
      "+------------------+\n",
      "|112.58333333333334|\n",
      "|116.03333333333336|\n",
      "|119.48333333333335|\n",
      "|122.93333333333334|\n",
      "|126.38333333333333|\n",
      "|129.83333333333331|\n",
      "| 133.2833333333333|\n",
      "| 136.7333333333333|\n",
      "|140.18333333333328|\n",
      "|143.63333333333327|\n",
      "|147.08333333333326|\n",
      "|150.53333333333325|\n",
      "|153.98333333333326|\n",
      "|157.43333333333325|\n",
      "|160.88333333333324|\n",
      "+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "predictions.select(\"prediction\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mySpark.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
